#!/usr/bin/env python
# coding=utf-8
# __author__ = 'Yunchao Ling'

from bs4 import BeautifulSoup
import ParseCommon


def getStudies(filepath):
    """Parse an XML file and return one dict per ``STUDY`` element.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        A list holding the result of ``getStudy`` for every ``STUDY``
        element found in the document; an empty list when there are none.
    """
    # BeautifulSoup consumes the whole file while building the tree, so the
    # handle can be closed right away instead of leaking until it is
    # garbage collected.
    with open(filepath) as xml_file:
        soup = BeautifulSoup(xml_file, 'lxml-xml')
    return [getStudy(study_soup) for study_soup in soup.find_all("STUDY")]


def getStudy(study_soup):
    """Convert a single ``STUDY`` element into a plain dict.

    Args:
        study_soup: Parsed ``STUDY`` element exposing its children as
            attributes (``IDENTIFIERS``, ``DESCRIPTOR``).

    Returns:
        A dict with the following keys:

        * ``name`` -- text of ``STUDY_TITLE``, or ``None`` when the
          descriptor or the title element is missing.
        * ``proj_no`` -- ``ids['PRIMARY_ID']['id']`` from
          ``ParseCommon.getIdentifiers``; only set when ``IDENTIFIERS``
          is present.
        * ``backup`` -- the identifiers dict, extended with a
          ``study_type`` entry when ``STUDY_TYPE`` is present.
        * ``description`` -- text of ``STUDY_ABSTRACT``; only set when that
          element is present.
    """
    study = {}

    identifiers = study_soup.IDENTIFIERS
    if identifiers is not None:
        ids = ParseCommon.getIdentifiers(identifiers)
        study['proj_no'] = ids['PRIMARY_ID']['id']
        study['backup'] = ids

    descriptor = study_soup.DESCRIPTOR
    if descriptor is None:
        # Without a descriptor there is no title, type or abstract to read;
        # return what was collected instead of failing on attribute access.
        study['name'] = None
        return study

    title_soup = descriptor.STUDY_TITLE
    study['name'] = title_soup.string if title_soup is not None else None

    study_type_soup = descriptor.STUDY_TYPE
    if study_type_soup is not None:
        # Prefer the free-form 'new_study_type' attribute and fall back to
        # 'existing_study_type' (which may itself be absent, i.e. None).
        study_type = study_type_soup.get('new_study_type')
        if study_type is None:
            study_type = study_type_soup.get('existing_study_type')
        # A study without IDENTIFIERS has no 'backup' dict yet; create it on
        # demand so that storing the type cannot raise KeyError.
        study.setdefault('backup', {})['study_type'] = study_type

    study_abstract_soup = descriptor.STUDY_ABSTRACT
    if study_abstract_soup is not None:
        study['description'] = study_abstract_soup.string

    return study


if __name__ == "__main__":
    # Script entry point: parse one study XML file and print the result.
    import sys

    # The sample path stays the default; an optional first command-line
    # argument overrides it.
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
    else:
        filepath = "D:/data/SRA000001/SRA000001.study.xml"
    # Parenthesised single-argument form prints the same on Python 2 and is
    # valid syntax on Python 3.
    print(getStudies(filepath))
